Here we investigate the outliers of a clustering on gene-expression generated from scRNA-seq data
First we load the with Seurat preprocessed 33k PBMC dataset from 10x Genomics. This data is analyzed using PCA and t-SNE on the variable genes. SNN (Shared nearest neighbour) is used to cluster the data in PC space.
load("../data/seurat_33k_main")
Subset the data to only the four major cell types
pbmc33k.subset <- SubsetData(pbmc33k.merged, ident.use = c("B", "NK", "TH", "TC", "Monocytes"))
TSNEPlot(pbmc33k.subset, do.label = T)
rm(pbmc33k.merged)
It’s fairly strait forward to select outliers inside t-SNE space.
The following function is used to select t-SNE outliers:
tsne.neighbours <- function(data, max.distance = 2) {
pb <- txtProgressBar(min = 0, max = nrow(data), style = 3)
for (i in 1:nrow(data)) {
# Calculate neighbours within max.distance
neighbours <- data$tSNE_1 > (data[i,"tSNE_1"] - max.distance) &
data$tSNE_1 < (data[i,"tSNE_1"] + max.distance) &
data$tSNE_2 > (data[i,"tSNE_2"] - max.distance) &
data$tSNE_2 < (data[i,"tSNE_2"] + max.distance) &
(data$tSNE_1 - data[i,"tSNE_1"])^2 + (data$tSNE_2 - data[i,"tSNE_2"])^2 < max.distance^2
data[i,"neighbours"] <- sum(neighbours) -1 # -1 to exclude current cell
setTxtProgressBar(pb, i)
}
close(pb)
return(data)
}
pbmc33k.tsne.rot <- tsne.neighbours(data = pbmc33k.subset@tsne.rot)
It’s a simple function to determine to amount of neighbours within a certain distance (euclidean). The neighbours of each cell within a radius of 2 are calculated.
If you select the cells that have less then 60 t-SNE neighbours.
plot(pbmc33k.subset@tsne.rot$tSNE_2~pbmc33k.subset@tsne.rot$tSNE_1,
pch=19, cex=0.4, col = ifelse(pbmc33k.tsne.rot$neighbours < 60 ,'red','green'))
Now let’s take a look at the number of gene and reads of the outliers. First, subset the data:
all.cells <- FetchData(pbmc33k.subset, c("nUMI", "nGene", "percent.mito"))
outlier.data <- FetchData(pbmc33k.subset, c("nUMI", "nGene", "percent.mito"),
cells.use = WhichCells(pbmc33k.subset, cells.use = pbmc33k.tsne.rot$neighbours < 20))
Compare the number of genes, mitochondrial gene content and amount of reads to total dataset
par(mfrow=c(1,3))
boxplot(outlier.data$nGene, all.cells$nGene, names = c("Outlier nGene", "nGene total") )
boxplot(outlier.data$nUMI, all.cells$nUMI, names = c("Outlier nUMI", "nUMI total") )
boxplot(outlier.data$percent.mito, all.cells$percent.mito, names = c("Outlier mito %", "mito % total") )
par(mfrow=c(1,1))
Now let’s investigate the data of the Th cells identified by the SNN algorithm and compare the outliers vs the non outliers
th.cell.data <- FetchData(pbmc33k.subset, c("nUMI", "nGene", "percent.mito"),
cells.use = WhichCells(pbmc33k.subset, ident = "TH"))
th.cell.outliers.data <- FetchData(pbmc33k.subset, c("nUMI", "nGene", "percent.mito"),
cells.use = WhichCells(pbmc33k.subset, cells.use = pbmc33k.tsne.rot$neighbours < 20, ident = "TH"))
The same trend is visible for the CD4+ T cells;
par(mfrow=c(1,3))
boxplot(th.cell.outliers.data$nGene, th.cell.data$nGene, names = c("Outlier nGene", "Total"))
boxplot(th.cell.outliers.data$nUMI, th.cell.data$nUMI, names = c("Outlier nUMI", "nUMI total") )
boxplot(th.cell.outliers.data$percent.mito, th.cell.data$percent.mito, names = c("Outlier mito %", "mito % total") )
par(mfrow=c(1,1))
Here CD4+ T cells are subsetted, so that the t-SNE outliers can be selected more accurate.
th.cells <- SubsetData(pbmc33k.subset,
ident.use = "TH" )
th.cells.tsne.rot <- tsne.neighbours(data = th.cells@tsne.rot)
plot(th.cells.tsne.rot$tSNE_2~th.cells.tsne.rot$tSNE_1,
pch=19, cex=0.4, col = ifelse(th.cells.tsne.rot$neighbours < 20 ,'red','green'))
th.cells.outliers <- SubsetData(th.cells,
cells.use = WhichCells(th.cells, cells.use = th.cells.tsne.rot$neighbours < 20))
th.cells.non.outliers <- SubsetData(th.cells,
cells.use = WhichCells(th.cells, cells.use = th.cells.tsne.rot$neighbours >= 20))
th.cells.subset.data.outliers <- FetchData(th.cells.outliers, c("nUMI", "nGene", "percent.mito"))
th.cells.subset.data.non.outliers <- FetchData(th.cells.non.outliers, c("nUMI", "nGene", "percent.mito"))
Now we compare the number of genes en UMI’s for the subsetted data
par(mfrow=c(1,3))
boxplot(th.cells.subset.data.outliers$nGene, th.cells.subset.data.non.outliers$nGene, names = c("Outlier nGene", "Total"))
boxplot(th.cells.subset.data.outliers$nUMI, th.cells.subset.data.non.outliers$nUMI, names = c("Outlier nUMI", "nUMI total") )
boxplot(th.cells.subset.data.outliers$percent.mito, th.cells.subset.data.non.outliers$percent.mito, names = c("Outlier mito %", "mito % total") )
par(mfrow=c(1,1))
The Th cell outliers show an increased expression of MS4A1 (B cell marker)
th.outliers.ms4a1 <- th.cells.outliers@data["MS4A1",]
th.non.outliers.ms4a1 <- th.cells.non.outliers@data["MS4A1",]
# Fraction ms4a1 expressing cells in outliers and mean expression
sum(th.outliers.ms4a1 != 0) / length(th.outliers.ms4a1)
## [1] 0.1641026
mean(th.outliers.ms4a1)
## [1] 0.3334613
# Fraction ms4a1 expressing cells in non-outliers
sum(th.non.outliers.ms4a1 != 0) / length(th.non.outliers.ms4a1)
## [1] 0.02631091
mean(th.non.outliers.ms4a1)
## [1] 0.01604771
FeaturePlot(th.cells, c("MS4A1", "GNLY", "LYZ", "CD8A"), cols.use = c("lightgrey","blue"), nCol = 2)
The outliers do express markers you would expect in other cell types.
So let’s try to use these marker genes to indicate doublets. Cells expressing markers from multiple cluster identities might be multiplets. Because the sub-population of T cells show to many overlapping markers we have to merge the Th and Tc cells.
pbmc33k.subset <- MergeNode(pbmc33k.subset, 9)
BuildClusterTree(pbmc33k.subset, do.reorder = T)
## An object of class seurat in project 10X_PBMC
## 23734 genes across 28943 samples.
levels(pbmc33k.subset@ident) <- c(levels(pbmc33k.subset@ident), "T")
pbmc33k.subset@ident[pbmc33k.subset@ident == "TH" | pbmc33k.subset@ident == "TC"] <- "T"
pbmc33k.subset@ident <- droplevels(pbmc33k.subset@ident) # Drop the unused TH and TC levels
TSNEPlot(pbmc33k.subset)
#save(pbmc33k.subset, file = "data/seurat_33k_t_merged")
Calculate the markers and save the results
#markers <- FindAllMarkers(pbmc33k.subset, do.print = T)
#save(markers, file = "../data/markers_B_mono_NK_T")
load("../data/markers_B_mono_NK_T")
Here we select the top 10 markers of every cluster having an occurrence lower than 10% in all the other clusters
top.markers <- NULL
for (cluster in levels(pbmc33k.subset@ident)) {
# select the first 10 markers where the percentage in the other cluster of that marker is < 10%
cluster.markers <- markers[markers$cluster==cluster & markers$pct.2<0.10,]
top.markers <- rbind(top.markers, cluster.markers[1:10,])
}
top.markers
Calculate the doublet score of every cell by looking at the fraction of markers from it’s own cluster compared to all other clusters
calculate.doublet.score <- function(data, top.markers) {
doublet.scores <- matrix(nrow = length(data@cell.names),
ncol = length(levels(data@ident)) + 1,
dimnames = list(data@cell.names, c("highest.frac", levels(data@ident))))
idents <- levels(data@ident)
pb <- txtProgressBar(min = 0, max = ncol(data@data), style = 3)
for (i in 1:ncol(data@data)) {
cell <- data@data[,i] # vector of gene expression of a cell
for (ident in idents) {
top.markers.ident <- top.markers[top.markers$cluster == ident, "gene"] # vector of gene names of top markers for identity class
cell.top.markers.ident <- cell[top.markers.ident] # vector of expression values of top marker genes of the current cell for the current idenity class
doublet.scores[i, ident] <- sum(cell.top.markers.ident) # sum of expression of top markers genes
}
# Calculate the maximum fraction of expression on the total expression of all the cell type markers
doublet.scores[i,1] <- max(doublet.scores[i,], na.rm = T) / sum(doublet.scores[i,], na.rm = T)
setTxtProgressBar(pb, i)
}
close(pb)
}
#double.scores <- calculate.doublet.score(data = pbmc33k.subset, top.markers = top.markers)
#save(doublet.scores, file = "../data/seurat_33k_t_merged_doublet.rda")
load("../data/seurat_33k_t_merged_doublet.rda")
doublet.scores[,1] <- 1 - doublet.scores[,1] # invert the score
pbmc33k.subset <- AddMetaData(pbmc33k.subset, doublet.scores[,1], "doublet.score")
#save(pbmc33k.subset, file = "../data/seurat_33k_t_merged_with_doublet")
Now let’s investigate the doublet scores.
hist(doublet.scores[,1], breaks = 100)
dim(doublet.scores[is.na(doublet.scores[,"highest.frac"]),]) # 36 cells are NA
## [1] 36 5
plot(pbmc33k.subset@tsne.rot$tSNE_1, pbmc33k.subset@tsne.rot$tSNE_2, cex = 0.2, col = ifelse(is.na(doublet.scores[,"highest.frac"]) ,'red','black'))
doublet.scores[is.na(doublet.scores)] <- 0 # I set the NA's to 0 because they don't show expression of the top-markers of any cell-type.
36 cells express no marker genes at all. We set the doublet score to 0, as most of these cells seem to be in the center of the cluster
pbmc33k.subset <- AddMetaData(pbmc33k.subset, doublet.scores[,1], "doublet.score")
all.cells <- FetchData(pbmc33k.subset, c("nUMI", "nGene", "percent.mito", "doublet.score"))
plot(all.cells$nGene, all.cells$doublet.score, pch=20, col=rgb(0,0,0,alpha=0.3), cex=0.2, cex.lab=1.4)
We would like to see an increase in gene count when the doublet score a high.
FeaturePlot(pbmc33k.subset, c("doublet.score"), cols.use = c("lightgrey", "blue"))
plot(pbmc33k.subset@tsne.rot$tSNE_1, pbmc33k.subset@tsne.rot$tSNE_2, cex = 0.2, col = ifelse(doublet.scores[,1] > 0.2 ,'red','lightgrey'))
The doublet score doesn’t seem to work that well in some of the Tc and NK cells.
gene.umi.ratio <- all.cells$nUMI / all.cells$nGene
palette <- colorRampPalette(c('white','blue'))
colors <- palette(10)[as.numeric(cut(gene.umi.ratio,breaks = 10))]
plot(pbmc33k.subset@tsne.rot$tSNE_1, pbmc33k.subset@tsne.rot$tSNE_2, cex = 0.2, col = colors)
Here we calculate a SNN doublet score giving cells having little SNN neighbours a high score and cells.
ecdf.snn.sums <- ecdf(snn.sums)
snn.score <- sapply(snn.sums, function(x) 1 - min(1, ecdf.snn.sums(x) * 2))
We do the same for the t-SNE neighbours.
ecdf.tsne.sums <- ecdf(pbmc33k.tsne.rot$neighbours)
tsne.score <- sapply(pbmc33k.tsne.rot$neighbours, function(x) 1 - min(1, ecdf.tsne.sums(x) * 2))
Plot the t-SNE and SNN doublet score in the t-SNE map
par(mfrow=c(1,2))
plot(pbmc33k.subset@tsne.rot$tSNE_1, pbmc33k.subset@tsne.rot$tSNE_2, cex = 0.2,
col = colorRampPalette(c('white','blue'))(10)[cut(tsne.score, breaks = 10)])
plot(pbmc33k.subset@tsne.rot$tSNE_1, pbmc33k.subset@tsne.rot$tSNE_2, cex = 0.2,
col = colorRampPalette(c('white','blue'))(10)[cut(snn.score, breaks = 10)])
par(mfrow=c(1,1))
Calculate a gene count score
ecdf.genes <- ecdf(all.cells$nGene)
hist(all.cells$nGene, breaks = 100)
gene.count.scores <- sapply(all.cells$nGene, function(x) min(1, ecdf.genes(x) * 2))
Combine all the scores
doublet.score.combined <- snn.score * tsne.score * doublet.scores[,1] * gene.count.scores
#doublet.score.combined <- snn.score + tsne.score + doublet.scores[,1] + gene.count.scores / 4
plot(pbmc33k.subset@tsne.rot$tSNE_1, pbmc33k.subset@tsne.rot$tSNE_2, cex = 0.2,
col = colorRampPalette(c('lightgrey','blue'))(10)[cut(doublet.score.combined, breaks = 10)])